{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial of hdfs python library"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "References:\n",
    "- [PyHive Documentation](https://github.com/dropbox/PyHive)\n",
    "- [Using Hive External Tables](https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.5/using-hiveql/content/hive_create_an_external_table.html)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install hdfs library from pyPi repo:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install --user pyhive"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "References:\n",
    "- [PyHive Documentation](https://github.com/dropbox/PyHive)\n",
    "- [Using Hive External Tables](https://docs.cloudera.com/HDPDocuments/HDP3/HDP-3.1.5/using-hiveql/content/hive_create_an_external_table.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip show parquet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HDFS DATASET\n",
    "Subimos el dataset a hdfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('world-cities_csv.csv',\n",
       "  {'accessTime': 1613144968999,\n",
       "   'blockSize': 134217728,\n",
       "   'childrenNum': 0,\n",
       "   'fileId': 16396,\n",
       "   'group': 'supergroup',\n",
       "   'length': 895586,\n",
       "   'modificationTime': 1613144969870,\n",
       "   'owner': 'hdfs',\n",
       "   'pathSuffix': 'world-cities_csv.csv',\n",
       "   'permission': '755',\n",
       "   'replication': 3,\n",
       "   'storagePolicy': 0,\n",
       "   'type': 'FILE'})]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from hdfs import InsecureClient\n",
    "hdfs_client = InsecureClient('http://hive-hdfs:50070', user='hdfs')\n",
    "if not(hdfs_client.status('/datasets/world_cities', strict=False)):\n",
    "    # Download a file or folder locally.\n",
    "    hdfs_client.makedirs('/datasets/world_cities')\n",
    "    hdfs_client.upload('/datasets/world_cities','world-cities_csv.csv', n_threads=5)\n",
    "\n",
    "hdfs_client.list('/datasets/world_cities',status=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyhive import hive\n",
    "conn = hive.Connection(host=\"hive-server\", port=10000, username=\"hive\")\n",
    "cursor = conn.cursor()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Consultas SQL con Hive\n",
    "\n",
    "Directamente sobre los archivos csv almacenados en hdfs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Crear base de datos de prueba en el metastore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('default',), ('hive_tutorial',)]\n"
     ]
    }
   ],
   "source": [
    "cursor.execute(\"CREATE DATABASE IF NOT EXISTS hive_tutorial\")\n",
    "cursor.execute(\"USE hive_tutorial\")\n",
    "\n",
    "conn.commit()\n",
    "cursor.execute(\"SHOW DATABASES\")\n",
    "conn.commit()\n",
    "print(cursor.fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute(\"DROP table world_cities\")\n",
    "query = \"\"\"\n",
    "CREATE EXTERNAL TABLE IF NOT EXISTS world_cities (\n",
    "        name STRING,\n",
    "        country STRING,\n",
    "        subcountry STRING,\n",
    "        geonameid INT)\n",
    "    COMMENT 'database with information about world cities'\n",
    "    ROW FORMAT DELIMITED\n",
    "    FIELDS TERMINATED BY ','\n",
    "    STORED AS TEXTFILE\n",
    "    location 'hdfs://hive-hdfs/datasets/world_cities/'\n",
    "    tblproperties (\"skip.header.line.count\"=\"1\")\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "cursor.execute(query)\n",
    "conn.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('world_cities',)]\n"
     ]
    }
   ],
   "source": [
    "cursor.execute(\"SHOW TABLES\")\n",
    "conn.commit()\n",
    "print(cursor.fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetching Total  10  rows\n",
      "Printing each row\n",
      "('les Escaldes', 'Andorra', 'Escaldes-Engordany', 3040051)\n",
      "('Andorra la Vella', 'Andorra', 'Andorra la Vella', 3041563)\n",
      "('Umm al Qaywayn', 'United Arab Emirates', 'Umm al Qaywayn', 290594)\n",
      "('Ras al-Khaimah', 'United Arab Emirates', 'Raʼs al Khaymah', 291074)\n",
      "('Khawr Fakkān', 'United Arab Emirates', 'Ash Shāriqah', 291696)\n",
      "('Dubai', 'United Arab Emirates', 'Dubai', 292223)\n",
      "('Dibba Al-Fujairah', 'United Arab Emirates', 'Al Fujayrah', 292231)\n",
      "('Dibba Al-Hisn', 'United Arab Emirates', 'Al Fujayrah', 292239)\n",
      "('Sharjah', 'United Arab Emirates', 'Ash Shāriqah', 292672)\n",
      "('Ar Ruways', 'United Arab Emirates', 'Abu Dhabi', 292688)\n"
     ]
    }
   ],
   "source": [
    "cursor.execute(\"SELECT * FROM world_cities limit 100\")\n",
    "conn.commit()\n",
    "size = 10\n",
    "records = cursor.fetchmany(size)\n",
    "print(\"Fetching Total \", size,\" rows\")\n",
    "print(\"Printing each row\")\n",
    "for row in records:\n",
    "    print(row)\n",
    "cursor.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fetching Total  10  rows\n",
      "Printing each row\n",
      "('les Escaldes',)\n",
      "('Andorra la Vella',)\n",
      "('Umm al Qaywayn',)\n",
      "('Ras al-Khaimah',)\n",
      "('Khawr Fakkān',)\n",
      "('Dubai',)\n",
      "('Dibba Al-Fujairah',)\n",
      "('Dibba Al-Hisn',)\n",
      "('Sharjah',)\n",
      "('Ar Ruways',)\n"
     ]
    }
   ],
   "source": [
    "cursor.execute(\"SELECT name FROM world_cities\")\n",
    "conn.commit()\n",
    "size = 10\n",
    "records = cursor.fetchmany(size)\n",
    "print(\"Fetching Total \", size,\" rows\")\n",
    "print(\"Printing each row\")\n",
    "for row in records:\n",
    "    print(row)\n",
    "cursor.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
